# Turn off scientific notation in printed output
options(scipen = 999)
# Load libraries
# NOTE(review): load order matters — TSA and forecast export some functions
# with the same names (e.g. arima-related helpers), so packages loaded later
# mask earlier ones
library(tseries)
library(ggplot2)
library(fpp)
library(forecast)
library(tsoutliers)
#library(arfima)
library(TSA)
library(fracdiff)
# Load dataset: one row per (trip_date, trip_hour) with the trip count
data <- read.csv("Chicago_Taxi_Trips_by_Hour.csv")
# Display the first day of data (24 hourly rows)
head(data, 24)
## trip_date trip_hour trip_count
## 1 2022-04-01 0 389
## 2 2022-04-01 1 232
## 3 2022-04-01 2 132
## 4 2022-04-01 3 80
## 5 2022-04-01 4 91
## 6 2022-04-01 5 179
## 7 2022-04-01 6 331
## 8 2022-04-01 7 588
## 9 2022-04-01 8 760
## 10 2022-04-01 9 898
## 11 2022-04-01 10 1014
## 12 2022-04-01 11 1132
## 13 2022-04-01 12 1230
## 14 2022-04-01 13 1225
## 15 2022-04-01 14 1262
## 16 2022-04-01 15 1318
## 17 2022-04-01 16 1244
## 18 2022-04-01 17 1420
## 19 2022-04-01 18 1448
## 20 2022-04-01 19 1319
## 21 2022-04-01 20 1043
## 22 2022-04-01 21 851
## 23 2022-04-01 22 682
## 24 2022-04-01 23 588
# Select desired time range: February 2023 (28 days x 24 hours = 672 obs)
start_date <- as.Date("2023-02-01")
end_date <- as.Date("2023-02-28")
# Convert trip_date explicitly before comparing; relying on implicit
# character-to-Date coercion inside the comparison is fragile if the CSV
# date format ever changes
data_daily <- subset(data, as.Date(trip_date) >= start_date & as.Date(trip_date) <= end_date)
# Hourly counts as a time series with a daily (24-observation) seasonal period
ts_hourly_24 <- ts(data_daily$trip_count, frequency=24)
# BUG FIX: the title said "April, 2023" but the window selected above is
# February 2023
plot(ts_hourly_24, main = "Hourly Taxi Trips (February, 2023)", ylab = "Number of Taxi Trips")
ggtsdisplay(ts_hourly_24, main = "ACF and PACF of Dataset")
max(ts_hourly_24)
## [1] 1782
min(ts_hourly_24)
## [1] 27
# Split the dataset into a training dataset (days 1-27) and a test dataset
# (day 28, i.e. the final 24 hours)
train <- window(ts_hourly_24, start=c(1, 1), end=c(27, 24))
test <- window(ts_hourly_24, start=c(28, 1), end=c(28, 24))
# Plot the ACF and PACF of the training dataset
ggtsdisplay(train, main = "ACF and PACF of Training Dataset")
# Plot the ACF and PACF of the test dataset
ggtsdisplay(test, main = "ACF and PACF of Test Dataset")
# Shapiro-Wilk test for normality on the training series
shapiro.test(train)
##
## Shapiro-Wilk normality test
##
## data: train
## W = 0.94843, p-value = 0.00000000000002999
# Estimate the Box-Cox transformation parameter from the training data
lambda <- BoxCox.lambda(train)
cat("lambda =", lambda)
## lambda = -0.2866391
# Apply Box-Cox transformation to the data
train_boxcox <- BoxCox(train, lambda=lambda)
# Plot the transformed dataset
# BUG FIX: the title said "April, 2023" but the data window is February 2023
plot(train_boxcox, main = "Hourly Taxi Trips (February, 2023) - Box-Cox Transformed", ylab = "Number of Taxi Trips")
The Shapiro-Wilk test for normality on the training dataset gives a p-value that is less than the significance level of 0.05. Therefore, we reject the null hypothesis of normality and conclude that the data are not normally distributed.
Additionally, the BoxCox.lambda() function gives a lambda value of -0.2866391. This indicates that a Box-Cox transformation with lambda = -0.2866391 could be applied to the data to achieve approximate normality.
Based on these results, it appears that Box-Cox transformation could be necessary to achieve normality in the data.
# Fit a non-seasonal ARIMA model (with automatic Box-Cox transformation)
# as a baseline; seasonal=FALSE deliberately ignores the 24-hour cycle
fit_arima <- auto.arima(train, lambda="auto", seasonal=FALSE)
# Print the model summary
summary(fit_arima)
## Series: train
## ARIMA(5,0,1) with non-zero mean
## Box Cox transformation: lambda= -0.2866373
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 mean
## 2.4561 -2.1812 0.6170 0.2321 -0.1743 -0.8662 2.8618
## s.e. 0.0433 0.1091 0.1359 0.1062 0.0403 0.0214 0.0055
##
## sigma^2 = 0.002722: log likelihood = 995.66
## AIC=-1975.33 AICc=-1975.1 BIC=-1939.54
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 26.59889 134.2236 91.03056 -1.327727 16.72024 0.653074 0.4830053
# Make forecasts for the next 24 hours (the held-out test day);
# the original comment incorrectly said "12 months"
fc_arima <- forecast(fit_arima, h=24)
# Plot forecast value
plot(fc_arima)
# Check residuals from a time series model (Ljung-Box test + residual ACF)
checkresiduals(fit_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with non-zero mean
## Q* = 278.94, df = 42, p-value < 0.00000000000000022
##
## Model df: 6. Total lags used: 48
eacf(train)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x o x x x x x x x x
## 1 x x x x x x x x x x x x x x
## 2 x x x o x o o o x x o x o x
## 3 x x x o x o o o o x o x x o
## 4 x x o x o o o o o o o x o o
## 5 x o o x o o o o o o o x o o
## 6 x o x x o o o o o o o o o o
## 7 x x x x o x x o o o x o o o
# Compare AICs of candidate seasonal ARIMA orders suggested by the EACF
# table above (commented lines record earlier trials and their AICs)
#AIC(Arima(train, order = c(4,0,6), seasonal = c(2,1,0))) #7023.799
AIC(Arima(train, order = c(3,0,5), seasonal = c(2,1,0))) #7019.807
## [1] 7319.957
#AIC(Arima(train, order = c(3,0,6), seasonal = c(2,1,0))) #7021.794
#AIC(Arima(train, order = c(4,0,1), seasonal = c(2,1,0))) #7032.476
#AIC(Arima(train, order = c(2,0,2), seasonal = c(2,1,0)))
# Fit the best manually-selected SARIMA model
fit_sarima_manual <- Arima(train, order = c(3,0,5), seasonal = c(2,1,0))
summary(fit_sarima_manual)
## Series: train
## ARIMA(3,0,5)(2,1,0)[24]
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 ma3 ma4 ma5 sar1
## -0.1225 -0.0763 0.7742 1.2820 1.2407 0.2778 0.0107 0.0306 -0.2153
## s.e. 0.0411 0.0395 0.0377 0.0606 0.1009 0.1236 0.0908 0.0496 0.0469
## sar2
## -0.2327
## s.e. 0.0410
##
## sigma^2 = 7052: log likelihood = -3648.98
## AIC=7319.96 AICc=7320.39 BIC=7368.75
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.07434939 81.7435 59.20618 -1.997507 14.64697 0.4247586
## ACF1
## Training set 0.001904293
# Forecast the next 24 hours with the manual SARIMA and inspect residuals
fc_sarima_manual <- forecast(fit_sarima_manual, h=24)
plot(fc_sarima_manual)
checkresiduals(fc_sarima_manual)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,5)(2,1,0)[24]
## Q* = 133.02, df = 38, p-value = 0.000000000001797
##
## Model df: 10. Total lags used: 48
# Fit a SARIMA model to the data (auto.arima with the seasonal search enabled)
fit_sarima <- auto.arima(train, lambda="auto", seasonal=TRUE)
# Print the model summary
summary(fit_sarima)
## Series: train
## ARIMA(4,0,2)(2,1,0)[24] with drift
## Box Cox transformation: lambda= -0.2866373
##
## Coefficients:
## ar1 ar2 ar3 ar4 ma1 ma2 sar1 sar2
## 0.7836 0.0919 -0.1884 -0.0577 0.3207 -0.0239 -0.4532 -0.2750
## s.e. 0.0505 0.0274 0.1016 0.0557 0.0615 0.0633 0.0393 0.0216
## drift
## 0.0001
## s.e. 0.0002
##
## sigma^2 = 0.002098: log likelihood = 1039
## AIC=-2058 AICc=-2057.64 BIC=-2013.64
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 6.283814 87.19842 65.5456 -1.376294 13.57846 0.470239 0.2457598
# Make forecasts for the next 24 hours (the held-out test day);
# the original comment incorrectly said "12 months"
fc_sarima <- forecast(fit_sarima, h=24)
# Plot forecast value
plot(fc_sarima)
# Check residuals (Ljung-Box test + residual ACF)
checkresiduals(fc_sarima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,2)(2,1,0)[24] with drift
## Q* = 124.19, df = 40, p-value = 0.0000000001481
##
## Model df: 8. Total lags used: 48
# Fit an ETS (exponential smoothing state space) model to the train data
fit_ets <- ets(train)
# Print the model summary
summary(fit_ets)
## ETS(A,N,A)
##
## Call:
## ets(y = train)
##
## Smoothing parameters:
## alpha = 0.9999
## gamma = 0.0001
##
## Initial states:
## l = 699.771
## s = -281.2529 -194.2293 -84.3107 87.4996 362.9293 484.995
## 508.1084 464.7836 396.7107 379.0918 372.0103 329.7809 238.4206 204.8213 222.0824 151.4105 -117.2581 -387.1846 -520.9475 -572.211 -586.873 -559.3244 -491.1973 -407.8557
##
## sigma: 80.2733
##
## AIC AICc BIC
## 9906.072 9908.510 10026.867
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1150181 78.64639 57.05006 -0.4428367 15.37159 0.4092902
## ACF1
## Training set 0.2584438
# Make forecasts for the next 24 hours (the held-out test day);
# the original comment incorrectly said "12 months"
fc_ets <- forecast(fit_ets, h = 24)
# Plot forecast value
plot(fc_ets)
# Check residuals from a time series model
checkresiduals(fit_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 358.4, df = 48, p-value < 0.00000000000000022
##
## Model df: 0. Total lags used: 48
eacf(train_boxcox)
## AR/MA
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x o x x x x x x x x x
## 1 x x x x o x x x x x x x x x
## 2 x o x x x o o o o o o o o o
## 3 x o o o o x o o o o o o o o
## 4 x x o o o o o o o o o o o o
## 5 x x x o o o o o o o o o o o
## 6 x x o x o o o o o o o o o o
## 7 x x x x x o o o o o o o o o
# Fit an ARFIMA (fractionally differenced) model
# NOTE(review): several loaded packages could provide arfima(); the recorded
# output below (fracdf optimization warnings) is consistent with
# forecast::arfima() — confirm which one is dispatched given the load order
fit_arfima <- arfima(train)
print(fit_arfima)
##
## Call:
## arfima(y = train)
##
## *** Warning during (fracdf) fit: C fracdf() optimization failure
##
## *** Warning during (fdcov) fit: unable to compute correlation matrix; maybe change 'h'
##
## Coefficients:
## d ar.ar1 ar.ar2 ma.ma1 ma.ma2 ma.ma3
## 0.38889066 0.01183326 0.56951412 -1.31938242 -0.61291727 -0.13303361
## ma.ma4
## 0.02618049
## sigma[eps] = 87.8429
## a list with components:
## [1] "log.likelihood" "n" "msg" "d"
## [5] "ar" "ma" "covariance.dpq" "fnormMin"
## [9] "sigma" "stderror.dpq" "correlation.dpq" "h"
## [13] "d.tol" "M" "hessian.dpq" "length.w"
## [17] "residuals" "fitted" "call" "x"
## [21] "series"
# Forecast the next 24 hours from the ARFIMA fit
fc_arfima <- forecast(fit_arfima, h=24)
plot(fc_arfima)
# varvefd <- arfima(train_boxcox)
# summary(varvefd)
# d <- summary(varvefd)$coef[[1]][1]
# d
# p_values <- 0:1 # Autoregressive order
# q_values <- 0:7 # Moving average order
#
# best_order <- c(p = NA, d = d, q = NA)
# best_aic <- Inf
#
# for (p in p_values) {
# for (q in q_values) {
# fit_model <- arima(train_boxcox, order = c(p, d, q), method = "ML", include.mean = TRUE)
# aic <- AIC(fit_model)
#
# if (aic < best_aic) {
# best_aic <- aic
# best_order <- c(p = p, d = d, q = q)
# }
# }
# }
#
#
# cat("Best ARFIMA order is", best_order, "with lowest AIC =", best_aic)
# # Fit an ARFIMA model to the train data
# fit_arfima <- arima(train_boxcox, order = c(1,d,7), method = "ML", include.mean = TRUE)
#
# summary(fit_arfima)
#
# fc_arfima <- predict(fit_arfima, h = 24)
# plot(fc_arfima$pred)
#
# #fc_arfima_inverse <- InvBoxCox(fc_arfima$pred, lambda)
# Check residuals from a time series model (Ljung-Box on ARFIMA residuals)
checkresiduals(fit_arfima)
##
## Ljung-Box test
##
## data: Residuals
## Q* = 589.6, df = 48, p-value < 0.00000000000000022
##
## Model df: 0. Total lags used: 48
# Search K = 1..12 Fourier harmonic pairs for the dynamic harmonic regression,
# keeping the best fitted model so it does not have to be refitted afterwards
best_k <- Inf
best_aicc <- Inf
best_fit <- NULL
for (i in 1:12) {
# Fit ARIMA errors with K Fourier pairs as exogenous seasonal regressors
fit_model <- auto.arima(train, xreg=fourier(train, K=i), seasonal=FALSE, lambda="auto")
# Extract the corrected AIC for model comparison
aicc <- fit_model$aicc
# Keep the fit with the lowest AICc seen so far
if (aicc < best_aicc) {
best_k <- i
best_aicc <- aicc
best_fit <- fit_model
}
}
# Print the best K and its AICc
cat("Best dynamic harmonic regression K =", best_k, "with lowest AICc =", best_aicc)
## Best dynamic harmonic regression K = 7 with lowest AICc = -2352.505
# Reuse the winning model from the search loop; the original refitted
# auto.arima() here, duplicating an expensive computation for no benefit
harmonics <- fourier(train, K=best_k)
fit_fourier <- best_fit
# Print the model summary
print(fit_fourier)
## Series: train
## Regression with ARIMA(3,1,1) errors
## Box Cox transformation: lambda= -0.2866373
##
## Coefficients:
## ar1 ar2 ar3 ma1 S1-24 C1-24 S2-24 C2-24
## 1.1384 -0.3041 -0.1041 -0.9883 -0.1936 -0.1269 -0.1036 0.0462
## s.e. 0.0395 0.0585 0.0398 0.0066 0.0085 0.0085 0.0080 0.0080
## S3-24 C3-24 S4-24 C4-24 S5-24 C5-24 S6-24 C6-24 S7-24
## 0.0102 0.0546 0.0284 0.0120 0.0119 0.0007 0.0059 -0.0048 -0.0011
## s.e. 0.0047 0.0047 0.0028 0.0028 0.0019 0.0019 0.0015 0.0015 0.0012
## C7-24
## -0.0035
## s.e. 0.0012
##
## sigma^2 = 0.001488: log likelihood = 1195.86
## AIC=-2353.72 AICc=-2352.5 BIC=-2268.74
# Make forecasts for the next 24 hours using future Fourier regressors;
# the original comment incorrectly said "12 months"
fc_fourier <- forecast(fit_fourier, xreg=fourier(train, best_k, 24))
# Plot forecast value
plot(fc_fourier)
# Check residuals from a time series model
checkresiduals(fit_fourier)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,1,1) errors
## Q* = 100.42, df = 44, p-value = 0.000002679
##
## Model df: 4. Total lags used: 48
# Fit a TBATS model (Box-Cox + trigonometric seasonality + ARMA errors)
fit_tbats <- tbats(train, use.box.cox=TRUE, use.arma.errors=TRUE)
# Print the model summary
# NOTE(review): summary() on a tbats object only lists component lengths,
# as the output below shows; print(fit_tbats) would show fitted parameters
summary(fit_tbats)
## Length Class Mode
## lambda 1 -none- numeric
## alpha 1 -none- numeric
## beta 0 -none- NULL
## damping.parameter 0 -none- NULL
## gamma.one.values 1 -none- numeric
## gamma.two.values 1 -none- numeric
## ar.coefficients 2 -none- numeric
## ma.coefficients 4 -none- numeric
## likelihood 1 -none- numeric
## optim.return.code 1 -none- numeric
## variance 1 -none- numeric
## AIC 1 -none- numeric
## parameters 2 -none- list
## seed.states 23 -none- numeric
## fitted.values 648 ts numeric
## errors 648 ts numeric
## x 14904 -none- numeric
## seasonal.periods 1 -none- numeric
## k.vector 1 -none- numeric
## y 648 ts numeric
## p 1 -none- numeric
## q 1 -none- numeric
## call 4 -none- call
## series 1 -none- character
## method 1 -none- character
# Make forecasts for the next 24 hours (the held-out test day);
# the original comment incorrectly said "12 months"
fc_tbats <- forecast(fit_tbats, h = 24)
# Plot forecast value
plot(fc_tbats)
# Check residuals from a time series model
checkresiduals(fit_tbats)
##
## Ljung-Box test
##
## data: Residuals from TBATS
## Q* = 108.36, df = 48, p-value = 0.000001467
##
## Model df: 0. Total lags used: 48
# Collect all forecast objects; this order defines the row order downstream
all_models <- list(fc_arima, fc_sarima, fc_sarima_manual, fc_ets, fc_arfima, fc_tbats, fc_fourier)
# BUG FIX: forecast::accuracy() expects accuracy(forecast, actual); the
# original passed accuracy(test, model$mean), which flips the sign of the
# ME and MPE columns (RMSE/MAE/MAPE are unaffected). Signed columns in the
# recorded output further below therefore have inverted signs.
accuracy_results <- lapply(all_models, function(model) accuracy(as.numeric(model$mean), as.numeric(test)))
# Residual diagnostics for every model
lapply(all_models, checkresiduals)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with non-zero mean
## Q* = 278.94, df = 42, p-value < 0.00000000000000022
##
## Model df: 6. Total lags used: 48
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,2)(2,1,0)[24] with drift
## Q* = 124.19, df = 40, p-value = 0.0000000001481
##
## Model df: 8. Total lags used: 48
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,5)(2,1,0)[24]
## Q* = 133.02, df = 38, p-value = 0.000000000001797
##
## Model df: 10. Total lags used: 48
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 358.4, df = 48, p-value < 0.00000000000000022
##
## Model df: 0. Total lags used: 48
##
## Ljung-Box test
##
## data: Residuals from ARFIMA(2,0.39,4)
## Q* = 589.6, df = 48, p-value < 0.00000000000000022
##
## Model df: 0. Total lags used: 48
##
## Ljung-Box test
##
## data: Residuals from TBATS(0, {2,4}, -, {<24,8>})
## Q* = 108.36, df = 48, p-value = 0.000001467
##
## Model df: 0. Total lags used: 48
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,1,1) errors
## Q* = 100.42, df = 44, p-value = 0.000002679
##
## Model df: 4. Total lags used: 48
## [[1]]
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,1) with non-zero mean
## Q* = 278.94, df = 42, p-value < 0.00000000000000022
##
##
## [[2]]
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,2)(2,1,0)[24] with drift
## Q* = 124.19, df = 40, p-value = 0.0000000001481
##
##
## [[3]]
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,5)(2,1,0)[24]
## Q* = 133.02, df = 38, p-value = 0.000000000001797
##
##
## [[4]]
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 358.4, df = 48, p-value < 0.00000000000000022
##
##
## [[5]]
##
## Ljung-Box test
##
## data: Residuals from ARFIMA(2,0.39,4)
## Q* = 589.6, df = 48, p-value < 0.00000000000000022
##
##
## [[6]]
##
## Ljung-Box test
##
## data: Residuals from TBATS(0, {2,4}, -, {<24,8>})
## Q* = 108.36, df = 48, p-value = 0.000001467
##
##
## [[7]]
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,1,1) errors
## Q* = 100.42, df = 44, p-value = 0.000002679
# Assemble the per-model accuracy rows into one table
metrics_table <- data.frame(do.call(rbind, accuracy_results))
# Labels must follow the order of all_models:
# (fc_arima, fc_sarima, fc_sarima_manual, fc_ets, fc_arfima, fc_tbats, fc_fourier)
# BUG FIX: the original swapped "TBATS" and "Dynamic Harmonic Regression"
# (positions 6 and 7), so the recorded table below mislabels those two rows.
metrics_table$Model <- c("ARIMA", "SARIMA_Auto", "SARIMA_Manual", "ETS", "ARFIMA", "TBATS", "Dynamic Harmonic Regression")
metrics_table <- metrics_table[, c("Model", "ME", "RMSE", "MAE", "MPE", "MAPE")]
print(metrics_table)
## Model ME RMSE MAE
## Test.set ARIMA -357.64449 497.4271 410.41048
## Test.set.1 SARIMA_Auto -105.16264 169.1483 131.88928
## Test.set.2 SARIMA_Manual -68.43938 108.6076 79.41854
## Test.set.3 ETS -133.29504 170.7190 138.95600
## Test.set.4 ARFIMA -233.74790 469.8839 417.61335
## Test.set.5 Dynamic Harmonic Regression -56.03499 113.7951 77.86034
## Test.set.6 TBATS -21.47085 102.5660 64.18240
## MPE MAPE
## Test.set -75.22301746 99.70367
## Test.set.1 -5.19455729 22.25136
## Test.set.2 -6.45875717 11.55319
## Test.set.3 -21.46020955 25.85445
## Test.set.4 -36.12321805 78.53504
## Test.set.5 -2.92832838 15.16795
## Test.set.6 0.02603666 13.35608
# Plot the metrics results.
# NOTE: the original added scale_fill_discrete() and then scale_fill_brewer()
# to each plot; the second scale silently replaces the first and emits a
# "Scale for 'fill' is already present" warning, so the redundant call is
# dropped. An accidental exact duplicate of the MAE plot and an unused
# forecast_colors vector (redefined later before any use) are also removed.
# Root Mean Squared Error
ggplot(metrics_table, aes(x = Model, y = RMSE, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
  labs(x = "Forecast Model", y = "Root Mean Squared Error") +
  theme_light() +
  scale_fill_brewer(palette="Set2") +
  ggtitle("Root Mean Squared Error")
# Mean Absolute Error
ggplot(metrics_table, aes(x = Model, y = MAE, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
  labs(x = "Forecast Model", y = "Mean Absolute Error") +
  theme_light() +
  scale_fill_brewer(palette="Set2") +
  ggtitle("Mean Absolute Error")
# Mean Absolute Percentage Error
ggplot(metrics_table, aes(x = Model, y = MAPE, fill = Model)) +
  geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
  labs(x = "Forecast Model", y = "Mean Absolute Percentage Error") +
  theme_light() +
  scale_fill_brewer(palette="Set2") +
  ggtitle("Mean Absolute Percentage Error")
# Plot the original dataset with all seven forecast series overlaid,
# using ggplot's default colour assignment
autoplot(ts_hourly_24, main="True Hourly Taxi Trips vs Predicted Values") +
autolayer(fc_arima$mean, series = "ARIMA") +
autolayer(fc_sarima$mean, series="SARIMA_Auto") +
autolayer(fc_sarima_manual$mean, series = "SARIMA_Manual") +
autolayer(fc_ets$mean, series="Exponential Smoothing") +
autolayer(fc_arfima$mean, series="ARFIMA") +
autolayer(fc_fourier$mean, series="Dynamic Harmonic Regression") +
autolayer(fc_tbats$mean, series="TBATS") +
theme_light() +
theme(legend.position="top") +
guides(color=guide_legend(nrow=2, byrow=TRUE)) +
ylab("Taxi Trips")
# Color palette for the zoomed comparison of the four selected models
forecast_colors <- c("#0066FF", "#8B4513", "#0000CD", "#FFD700")
# Create a named vector to map series to colors
forecast_color_map <- setNames(forecast_colors, c("SARIMA_Auto", "ARFIMA", "Dynamic Harmonic Regression", "TBATS"))
# Plot the last week of the series with the four selected forecasts
autoplot(ts_hourly_24, main = "Hourly Taxi Trips: Actual vs. Forecasted", lwd = 1.3) +
  autolayer(fc_sarima$mean, series = "SARIMA_Auto", linetype = "solid", lwd = 0.6) +
  autolayer(fc_arfima$mean, series = "ARFIMA", linetype = "solid", lwd = 0.6) +
  autolayer(fc_fourier$mean, series = "Dynamic Harmonic Regression", linetype = "solid", lwd = 0.6) +
  autolayer(fc_tbats$mean, series = "TBATS", linetype = "solid", lwd = 0.6) +
  scale_color_manual(values = forecast_color_map) +
  theme_light() +
  theme(plot.title = element_text(face = "bold")) +
  theme(legend.position = "top", legend.title = element_blank()) +
  ylab("Taxi Trips") +
  xlim(max(time(ts_hourly_24)) - 7, max(time(ts_hourly_24))) # Show only the last 7 days
# Full-series plot with all seven forecasts.
# BUG FIX: the original reused the 4-entry forecast_color_map here while
# drawing 7 series, which makes scale_color_manual() fail with
# "Insufficient values in manual scale"; supply a 7-entry map instead.
all_series_color_map <- setNames(
  c("#FF9900", "#0066FF", "#A9A9A9", "#000066", "#8B4513", "#0000CD", "#FFD700"),
  c("ARIMA", "SARIMA_Auto", "SARIMA_Manual", "Exponential Smoothing", "ARFIMA", "Dynamic Harmonic Regression", "TBATS"))
autoplot(ts_hourly_24, main="True Hourly Taxi Trips vs Predicted Values") +
  autolayer(fc_arima$mean, series = "ARIMA") +
  autolayer(fc_sarima$mean, series="SARIMA_Auto") +
  autolayer(fc_sarima_manual$mean, series = "SARIMA_Manual") +
  autolayer(fc_ets$mean, series="Exponential Smoothing") +
  autolayer(fc_arfima$mean, series="ARFIMA") +
  autolayer(fc_fourier$mean, series="Dynamic Harmonic Regression") +
  autolayer(fc_tbats$mean, series="TBATS") +
  scale_color_manual(values = all_series_color_map) +
  theme_light() +
  theme(legend.position="top") +
  guides(color=guide_legend(nrow=2, byrow=TRUE)) +
  ylab("Taxi Trips")
#xlim(max(time(ts_hourly_24)) - 7, max(time(ts_hourly_24)))
# Define color palette for the forecast plot (all seven series)
forecast_colors <- c("#FFD700", "#0066FF", "#8B4513", "#0000CD", "#FF9900", "#000066", "#A9A9A9")
# Create a named vector to map series to colors (7 colors, 7 series names)
forecast_color_map <- setNames(forecast_colors, c("SARIMA_Auto", "ARFIMA", "Dynamic Harmonic Regression", "TBATS", "ARIMA", "ETS", "SARIMA_Manual"))
# Plot the original dataset and all seven forecasted series
autoplot(ts_hourly_24, main = "Hourly Taxi Trips: Actual vs. Forecasted") +
autolayer(fc_sarima$mean, series = "SARIMA_Auto") +
autolayer(fc_arfima$mean, series = "ARFIMA") +
autolayer(fc_fourier$mean, series = "Dynamic Harmonic Regression") +
autolayer(fc_tbats$mean, series = "TBATS") +
autolayer(fc_arima$mean, series = "ARIMA") +
autolayer(fc_ets$mean, series = "ETS") +
autolayer(fc_sarima_manual$mean, series = "SARIMA_Manual") +
scale_color_manual(values = forecast_color_map) +
theme_light() +
theme(plot.title = element_text(face = "bold")) +
theme(legend.position = "top", legend.title = element_blank()) +
ylab("Taxi Trips")
#xlim(max(time(ts_hourly_24)) - 7, max(time(ts_hourly_24)))
# Define color palette for the forecast plot (four models are drawn below)
# BUG FIX: the original passed 7 colors but only 4 names to setNames(),
# leaving three NA-named entries in the map; lengths now match.
forecast_colors <- c("#FFD700", "#0066FF", "#8B4513", "#0000CD")
# Create a named vector to map series to colors
forecast_color_map <- setNames(forecast_colors, c("SARIMA_Auto", "ARFIMA", "Dynamic Harmonic Regression", "TBATS"))
# Plot the last week of the series and the four forecasted series
# BUG FIX: the title said "Monthly Taxi Trips"; the series is hourly
autoplot(ts_hourly_24, main = "Hourly Taxi Trips: Actual vs. Forecasted", lwd = 1.3) +
  autolayer(fc_sarima$mean, series = "SARIMA_Auto") +
  autolayer(fc_arfima$mean, series = "ARFIMA") +
  autolayer(fc_fourier$mean, series = "Dynamic Harmonic Regression") +
  autolayer(fc_tbats$mean, series = "TBATS") +
  #autolayer(fc_arima$mean, series = "ARIMA") +
  #autolayer(fc_ets$mean, series = "ETS") +
  #autolayer(fc_sarima_manual$mean, series = "SARIMA_Manual") +
  scale_color_manual(values = forecast_color_map) +
  theme_light() +
  theme(plot.title = element_text(face = "bold")) +
  theme(legend.position = "top", legend.title = element_blank()) +
  ylab("Taxi Trips") +
  xlim(max(time(ts_hourly_24)) - 7, max(time(ts_hourly_24))) # Show only the last 7 days
# Box plot of trip counts grouped by seasonal position (hour of day, 1-24)
season <- cycle(ts_hourly_24)
season.factor <- factor(season)
# Put the shared mapping in ggplot() so geom_boxplot() inherits it
ggplot(mapping = aes(x = season.factor, y = ts_hourly_24)) +
  geom_boxplot() +
  theme_light() +
  labs(x="Hour", y="Trips", title="Box Plot of Hourly Trip Counts by Hour of the Day")